Load statistics
read_stats <- read_tsv(c("https://sid.erda.dk/share_redirect/D9oEqxuxql/reports/by_step/reads_data/multiqc_general_stats.txt",
"https://sid.erda.dk/share_redirect/e0EsEFnsKu/reports/by_step/reads_data/multiqc_general_stats.txt")) %>%
mutate(Sample = str_extract(Sample, "M\\d+")) %>%
rename_with(~str_remove(., "FastQC_mqc-generalstats-fastqc-"), starts_with("FastQC_mqc-generalstats-fastqc-")) %>%
rename(microsample=Sample) %>%
select(microsample,percent_duplicates,total_sequences, percent_gc) %>%
group_by(microsample) %>%
summarise(total_sequences=sum(total_sequences), percent_duplicates=mean(percent_duplicates), percent_gc=mean(percent_gc))
host_mapping_stats <- read_tsv(c("https://sid.erda.dk/share_redirect/b9ZbTPY0kQ/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt",
"https://sid.erda.dk/share_redirect/e0EsEFnsKu/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt")) %>%
mutate(reference = case_when(
grepl("GRCh38", Sample, ignore.case = TRUE) ~ "human",
grepl("GRCg7b", Sample, ignore.case = TRUE) ~ "chicken",
TRUE ~ NA )) %>%
filter(reference=="chicken") %>%
mutate(Sample = str_extract(Sample, "M\\d+")) %>%
rename(microsample=Sample,reads_mapped_host=mapped_passed,reads_mapped_host_percent=mapped_passed_pct) %>%
select(microsample,reads_mapped_host,reads_mapped_host_percent) %>%
group_by(microsample) %>%
summarise(reads_mapped_host=sum(reads_mapped_host),reads_mapped_host_percent=mean(reads_mapped_host_percent))
human_mapping_stats <- read_tsv(c("https://sid.erda.dk/share_redirect/b9ZbTPY0kQ/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt",
"https://sid.erda.dk/share_redirect/e0EsEFnsKu/reports/by_step/preprocess_data/multiqc_samtools_flagstat.txt")) %>%
mutate(reference = case_when(
grepl("GRCh38", Sample, ignore.case = TRUE) ~ "human",
grepl("GRCg7b", Sample, ignore.case = TRUE) ~ "chicken",
TRUE ~ NA )) %>%
filter(reference=="human") %>%
mutate(Sample = str_extract(Sample, "M\\d+")) %>%
rename(microsample=Sample, reads_mapped_human=mapped_passed,reads_mapped_human_percent=mapped_passed_pct) %>%
select(microsample,reads_mapped_human,reads_mapped_human_percent) %>%
group_by(microsample) %>%
summarise(reads_mapped_human=sum(reads_mapped_human),reads_mapped_human_percent=mean(reads_mapped_human_percent))
quantification_stats <- read_tsv(c(
"https://sid.erda.dk/share_redirect/D9oEqxuxql/reports/by_step/quantify_data/multiqc_general_stats.txt",
"https://sid.erda.dk/share_redirect/e0EsEFnsKu/reports/by_step/quantify_data/multiqc_general_stats.txt")) %>%
filter(str_detect(Sample, "mgg-pbdrep")) %>%
mutate(Sample = str_extract(Sample, "M\\d+")) %>%
rename_with(~str_remove(., "Samtools: stats_mqc-generalstats-samtools_stats-"),
starts_with("Samtools: stats_mqc-generalstats-samtools_stats-")) %>%
rename_with(~str_remove(., "Samtools: flagstat_mqc-generalstats-samtools_flagstat-"),
starts_with("Samtools: flagstat_mqc-generalstats-samtools_flagstat-")) %>%
rename(microsample=Sample) %>%
group_by(microsample) %>%
summarise(reads_mapped=sum(reads_mapped),reads_mapped_percent=mean(reads_mapped_percent))
quality_stats <- read_stats %>%
left_join(host_mapping_stats, by=join_by(microsample==microsample)) %>%
left_join(human_mapping_stats, by=join_by(microsample==microsample)) %>%
left_join(quantification_stats, by=join_by(microsample==microsample))
Quality flagging
quality <- quality_stats %>%
mutate(depth = case_when(
total_sequences <= 10000000 ~ 0,
total_sequences > 10000000 ~ 1,
TRUE ~ NA)) %>%
mutate(duplicates = case_when(
percent_duplicates >= 65 ~ 0,
percent_duplicates < 65 ~ 1,
TRUE ~ NA)) %>%
mutate(gc = case_when(
percent_gc >= 55 ~ 0,
percent_gc < 55 ~ 1,
TRUE ~ NA)) %>%
mutate(human = case_when(
reads_mapped_human_percent >= 5 ~ 0,
reads_mapped_human_percent < 5 ~ 1,
TRUE ~ NA)) %>%
mutate(bacteria = case_when(
reads_mapped_percent <= 60 ~ 0,
reads_mapped_percent > 60 ~ 1,
TRUE ~ NA)) %>%
select(microsample, depth, duplicates, gc, human, bacteria) %>%
mutate(quality = depth + duplicates + gc + human + bacteria) %>%
select(microsample, quality)
quality %>% write_tsv("results/quality.tsv")
Quality overview
quality %>%
left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
filter(section != "Ileum") %>%
ggplot(aes(x=quality,y=microsample,fill=protocol))+
geom_col()+
scale_fill_manual(values=c("#6375ad","#d1b454","#d6885e")) +
geom_vline(xintercept=5, linetype="dashed", color = "red", size=1) +
facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
theme(strip.text.y.left = element_text(angle = 0)) +
labs(x="Quality score", y="Microsamples", fill="Library protocol")

quality %>%
left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
filter(section != "Ileum") %>%
filter(type == "Positive") %>%
group_by(protocol) %>%
summarise(average=mean(quality, na.rm=TRUE), percentage_5 = mean(quality == 5, na.rm = TRUE) * 100) %>%
tt()
tinytable_ks13y9vnyr5wl9i4knk4
| protocol |
average |
percentage_5 |
| full_celero |
3.857143 |
57.14286 |
| full_ulv2 |
4.500000 |
77.77778 |
| half_ulv2 |
4.500000 |
85.71429 |